/**
 * @brief 测试opencv并行计算
 * @see
 * https://docs.opencv.org/4.x/dc/ddf/tutorial_how_to_use_OpenCV_parallel_for_new.html
 */

#define FMT_HEADER_ONLY
#include <fmt/core.h>
#include <fmt/format.h>

#include <opencv2/core/utility.hpp>
#include <opencv2/opencv.hpp>
#include <utility>

#include "env.h"

using cv::getTickCount;
using cv::getTickFrequency;
using cv::Mat;

/**
 * @brief 并行卷积计算
 * @param[in] src 输入灰度图
 * @param[out] dst 卷积结果
 * @param[in] kernel 卷积核
 */
void conv_seq(Mat src, Mat& dst, const Mat& kernel) {
    CV_Assert(src.channels() == 1);

    int rows = src.rows;
    int cols = src.cols;
    dst      = Mat(rows, cols, src.type());

    int sz = kernel.rows / 2;
    // 填充src图像的边界，方便卷积计算不用对边界进行特殊处理
    copyMakeBorder(src, src, sz, sz, sz, sz, cv::BORDER_REPLICATE);

    for (int i = 0; i < rows; ++i) {
        // 获取dst矩阵的行指针
        auto* dstPtr = dst.ptr(i);
        for (int j = 0; j < cols; ++j) {
            int value = 0;
            // 从卷积核左上角开始访问，以3x3的卷积核为例，将中心点作为原点
            // (-1 -1) (0 -1) (1 -1)
            // (-1 0) (0 0) (1 0)
            // (-1 1) (0 1) (1 1)
            for (int k = -sz; k <= sz; k++) {
                auto* srcPtr = src.ptr(i + sz + k);
                for (int l = -sz; l <= sz; ++l) {
                    // print("kernel: {}\t", kernel.ptr<int>(k + sz)[l + sz]);
                    // !kernel使用的char型进行存储,原文这里使用double进行读取,解析为浮点极小值,导致最终全是0
                    // !如果省略类型,则会按照uchar进行解释,负值溢出导致结果出错
                    value += kernel.ptr<char>(k + sz)[l + sz] * srcPtr[j + sz + l];
                }
            }
            dstPtr[j] = cv::saturate_cast<uchar>(value);
        }
    }
}

class parallelConvlution : public cv::ParallelLoopBody {
private:
    Mat  m_src;
    Mat& m_dst;
    Mat  m_kernel;
    int  m_sz;
    int  m_dstCols;

public:
    explicit parallelConvlution(Mat src, Mat& dst, const Mat& kernel)
        : m_src(std::move(src)), m_dst(dst), m_kernel(kernel) {
        m_sz      = kernel.rows / 2;
        m_dstCols = m_src.cols;
        copyMakeBorder(m_src, m_src, m_sz, m_sz, m_sz, m_sz, cv::BORDER_REPLICATE);
    }

    virtual void operator()(const cv::Range& range) const CV_OVERRIDE {
        for (int r = range.start; r < range.end; r++) {
            // src存储在一块连续的线性内存上，那么r就是单个元素在该线性空间的序号
            // 所以i,j就分别表示该元素在二位矩阵空间的行和列
            int i = r / m_dstCols;
            int j = r % m_dstCols;

            int value = 0;

            for (int k = -m_sz; k <= m_sz; k++) {
                const auto* srcPtr = m_src.ptr(i + m_sz + k);
                for (int l = -m_sz; l <= m_sz; ++l) {
                    value += m_kernel.ptr<char>(k + m_sz)[l + m_sz] * srcPtr[j + m_sz + l];
                }
            }
            m_dst.ptr(i)[j] = cv::saturate_cast<uchar>(value);
        }
    }
};

int main(void) {
    auto test_file = kResourcesDir + "test01.jpg";
    auto src       = imread(test_file, cv::IMREAD_GRAYSCALE);
    // resize(src, src, Size(), 0.5, 0.5);

    /**
     * 卷积核
     * -1 0 1
     * -1 0 1
     * -1 0 1
     */
    Mat kernel = (cv::Mat_<char>(3, 3) << 0, -1, 0, -1, 4, -1, 0, -1, 0);

    Mat result;

    int64_t t       = 0;
    double  elapsed = 0;

    t = getTickCount();

    filter2D(src, result, src.depth(), kernel);

    elapsed = (getTickCount() - t) / getTickFrequency() * 1e6;

    fmt::print("filter2d elaped {} us\n", elapsed);

    t = getTickCount();

    conv_seq(src, result, kernel);

    elapsed = (getTickCount() - t) / getTickFrequency() * 1e6;

    fmt::print("conv_seq elaped {} us\n", elapsed);

    /************************************************************/

    result = Mat::zeros(src.size(), CV_8U);
    t      = getTickCount();

    cv::setNumThreads(4);
    // 在虚拟机上没有看出差异
    // 在实际项目中进行测试，重复对256x256的灰度图进行遍历，发现:
    // 1. 串行访问时间不稳定，并随着运行次数增加，平均耗时在缓慢增加
    // 2.
    // 4核并行平均耗时明显低于2核平均耗时，且随着运行时间平均耗时在逐渐降低并稳定在某值附近波动
    parallelConvlution target(src, result, kernel);
    // 在给定范围内进行并行处理
    parallel_for_(cv::Range(0, src.rows * src.cols), target);

    elapsed = (getTickCount() - t) / getTickFrequency() * 1e6;

    fmt::print("conv_parallel elapsed {} us\n", elapsed);
#if __cplusplus >= 201100L
    /************************************************************/
    result = Mat::zeros(src.size(), CV_8U);
    t      = getTickCount();

    Mat srcWithBorder;
    int sz = kernel.rows / 2;
    copyMakeBorder(src, srcWithBorder, sz, sz, sz, sz, cv::BORDER_REPLICATE);

    parallel_for_(cv::Range(0, src.rows * src.cols), [&](const cv::Range& range) {
        for (int r = range.start; r < range.end; r++) {
            int i = r / src.cols;
            int j = r % src.cols;

            int value = 0;

            for (int k = -sz; k <= sz; k++) {
                auto* srcPtr = srcWithBorder.ptr(i + sz + k);
                for (int l = -sz; l <= sz; ++l) {
                    value += kernel.ptr<char>(k + sz)[l + sz] * srcPtr[j + sz + l];
                }
            }
            result.ptr(i)[j] = cv::saturate_cast<uchar>(value);
        }
    });

    elapsed = (getTickCount() - t) / getTickFrequency() * 1e6;

    fmt::print("conv_parallel elapsed {} us\n", elapsed);
#endif
    // FileStorage out("out.yml", FileStorage::WRITE);
    // out << "result" << result;
    // out.release();

    cv::imwrite("conv.jpg", result);
}

/**
filter2d elaped 943.651 us
conv_seq elaped 37002.737 us
conv_parallel elapsed 23291.407 us
conv_parallel elapsed 27860.273 us
*/
